
import pandas as pd
import numpy as np
import geopandas as gpd
import requests
from bs4 import BeautifulSoup
import os
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import folium
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
# --- Scrape the Busan subdivisions table from Wikipedia ---
url = "https://en.wikipedia.org/wiki/Busan"
page_html = requests.get(url).text
# The first 'sortable' table on the page holds the subdivision data:
# 4 cells per row (name, Korean name, area, population).
tds = BeautifulSoup(page_html, 'lxml').find('table', {'class': 'sortable'}).findAll('td')

busan_columns = ['Subdivision', 'Korean', 'Area', 'Population']
busan_rows = []
for i in range(0, len(tds), 4):
    busan_rows.append({
        'Subdivision': tds[i].text.strip(),
        'Korean': tds[i + 1].text.strip(),
        'Area': tds[i + 2].text.strip(),
        # Population is rendered with thousands separators, e.g. "1,234,567".
        'Population': int(tds[i + 3].text.strip().replace(',', '')),
    })

busan_data = pd.DataFrame(busan_rows, columns=busan_columns)
# Fix: use drop(columns=...). The original drop(busan_data[['Korean']], axis=1)
# only worked by accident, because iterating a DataFrame yields column names.
busan_data = busan_data.drop(columns=['Korean'])
busan_data

# --- Load the Busan geojson and merge it with the scraped table ---
nil = gpd.read_file('busan.geojson')
nil = nil.drop(columns=['SIG_CD'])
nil = nil.rename(columns={'SIG_ENG_NM': 'Subdivision', 'SIG_KOR_NM': 'Korean'})
# Merge Busan geojson and table on the English subdivision name.
busan_data = nil.merge(busan_data, on='Subdivision')
busan_data
def build_map(data, map_type='default'):
    """Build a folium choropleth map of Busan's subdivisions.

    Parameters
    ----------
    data : GeoDataFrame carrying 'Subdivision' plus 'Population' (and, for
        the non-default map, 'Cluster Labels') columns.
    map_type : 'default' colors subdivisions by population; any other value
        colors them by their KMeans 'Cluster Labels'.

    Returns
    -------
    folium.Map centered on Busan.
    """
    # Geocode the city center once and reuse it (the original called
    # geolocator.geocode twice for the same query).
    location = Nominatim(user_agent='busan_explorer').geocode('Busan, Korea')
    busan_map = folium.Map(
        location=[location.latitude, location.longitude],
        zoom_start=11,
    )

    if map_type == 'default':
        extra = dict(
            columns=['Subdivision', 'Population'],
            fill_color='YlOrRd',
            line_opacity=0.2,
            legend_name='Population in Busan',
        )
    else:
        extra = dict(
            columns=['Subdivision', 'Cluster Labels'],
            fill_color='Pastel1',
            line_opacity=0.3,
            threshold_scale=[0, 1, 2, 3, 4],
            legend_name='Cluster Labels',
        )

    # Fix: folium.Map.choropleth() is deprecated and removed in modern
    # folium releases — use the folium.Choropleth class instead.
    folium.Choropleth(
        geo_data=data,
        data=data,
        key_on='feature.properties.Subdivision',
        fill_opacity=0.7,
        **extra,
    ).add_to(busan_map)
    return busan_map
def build_tooltip(busan_map, data, fields, aliases):
    """Overlay a transparent GeoJson layer with a hover tooltip on a map.

    Parameters
    ----------
    busan_map : folium.Map to decorate (modified in place).
    data : GeoDataFrame / geojson-like whose features expose `fields`.
    fields : list of feature property names to show in the tooltip.
    aliases : labels rendered next to each field.

    Returns
    -------
    The same folium.Map, so calls can be chained. (Fix: the original
    returned None, yet callers assigned the result to a variable.)
    """
    # Nearly invisible fill so this layer does not hide the choropleth below.
    style_function = lambda x: {'fillColor': '#ffffff',
                                'color': '#000000',
                                'fillOpacity': 0.1,
                                'weight': 0.1}
    # Darken the hovered subdivision so the user sees what the tooltip refers to.
    highlight_function = lambda x: {'fillColor': '#000000',
                                    'color': '#000000',
                                    'fillOpacity': 0.50,
                                    'weight': 0.1}
    geo = folium.GeoJson(
        data,
        style_function=style_function,
        control=False,
        highlight_function=highlight_function,
        tooltip=folium.features.GeoJsonTooltip(
            fields=fields,
            aliases=aliases,
            style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;")
        )
    )
    busan_map.add_child(geo)
    # Keep the tooltip layer above the choropleth so hover events reach it.
    busan_map.keep_in_front(geo)
    folium.LayerControl().add_to(busan_map)
    return busan_map
# Render the population choropleth with a hover tooltip.
busan_map = build_map(busan_data, 'default')
tooltip = build_tooltip(busan_map, busan_data, ['Subdivision', 'Population'], ['Subdivision: ','Population: '])
busan_map
# SECURITY(review): Foursquare API credentials are hard-coded in source.
# They should be loaded from environment variables or a config file that is
# kept out of version control, and these keys should be rotated.
CLIENT_ID = 'VF0AUJWBTE2RYQHKRCCI31WJEVCKTRR4DTQOBEPBXIZYPLK2'
CLIENT_SECRET = 'MUGUC4RBSDWTLI4W2JSYO2Y2IDCCUOEXRGXVWJSLP11Y03XM'
VERSION = '20200711'  # Foursquare API version date
RADIUS = 1000  # NOTE(review): defined but never sent to the API — confirm intent
LIMIT = 100    # NOTE(review): defined but never sent to the API — confirm intent
def getNearbyVenues(subdivision, korean):
    """Query the Foursquare 'explore' endpoint for each Busan subdivision.

    Parameters
    ----------
    subdivision : iterable of English subdivision names (used for labelling).
    korean : iterable of matching Korean names (used in the 'near=' query).

    Returns
    -------
    DataFrame with one row per venue: Subdivision, Venue, Venue Latitude,
    Venue Longitude, Venue Category.
    """
    # Loop-invariant suffix of the 'near' query ("Busan Metropolitan City,
    # South Korea" in Korean) — hoisted out of the loop.
    busa_kor = '부산광역시, 대한민국'
    # NOTE(review): module-level RADIUS and LIMIT are never included in the
    # request URL; add '&radius={}&limit={}' here if they are intended to apply.
    venues = []
    for sub, kor in zip(subdivision, korean):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&near={}, {}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            kor,
            busa_kor
        )
        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues.append([
            (
                sub,
                v['venue']['name'],
                v['venue']['location']['lat'],
                v['venue']['location']['lng'],
                # Fix: the original indexed categories[0] unconditionally and
                # raised IndexError for venues with no category assigned.
                v['venue']['categories'][0]['name'] if v['venue']['categories'] else 'Unspecified',
            )
            for v in results
        ])
    nearby_venues = pd.DataFrame([item for group in venues for item in group])
    nearby_venues.columns = ['Subdivision',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    return nearby_venues
# Fetch nearby venues for every subdivision from Foursquare.
busan_venues = getNearbyVenues(subdivision=busan_data['Subdivision'],
                               korean=busan_data['Korean'])
print(busan_venues.shape)
# Fix: the original called .hean(), which is not a DataFrame method and
# raised AttributeError; .head() was intended.
busan_venues.head()
# Bar chart of the ten most frequent venue categories across Busan.
venues = busan_venues.groupby('Venue Category').count().sort_values(by='Venue', ascending=False).head(10)
venues.plot.bar(y="Venue", use_index=True, rot=70, title="Top 10 highest number of venues in Busan", figsize=(15,5));
print('There are {} uniques categories.'.format(len(busan_venues['Venue Category'].unique())))
# One-hot encode venue categories: each category becomes a 0/1 indicator column.
busan_onehot = pd.get_dummies(busan_venues[['Venue Category']], prefix="", prefix_sep="")
busan_onehot['Subdivision'] = busan_venues['Subdivision']
# Reorder so the Subdivision column comes first.
category_cols = [c for c in busan_onehot.columns if c != 'Subdivision']
busan_onehot = busan_onehot[['Subdivision'] + category_cols]
print(busan_onehot.shape)
busan_onehot.head()

# Collapse to one row per subdivision: total count of each venue category.
busan_grouped = busan_onehot.groupby('Subdivision').sum().reset_index()
print(busan_grouped.shape)
busan_grouped.head()
def return_most_common_venues(row, num_top_venues):
    """Return the names of the `num_top_venues` highest-count categories.

    `row` is a Series whose first entry is the subdivision name and whose
    remaining entries are per-category venue counts.
    """
    counts = row.iloc[1:]
    ranked = counts.sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]
num_top_venues = 10

# Build column headers with English ordinal suffixes (1st, 2nd, 3rd, 4th, ...).
# NOTE: this suffix scheme is only correct up to 10 — fine for num_top_venues=10.
indicators = ['st', 'nd', 'rd']
columns = ['Subdivision']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except IndexError:  # fix: was a bare `except:`, which hides unrelated errors
        columns.append('{}th Most Common Venue'.format(ind+1))

# One row per subdivision listing its top-10 venue categories in rank order.
subdivision_venues_sorted = pd.DataFrame(columns=columns)
subdivision_venues_sorted['Subdivision'] = busan_grouped['Subdivision']
for ind in np.arange(busan_grouped.shape[0]):
    subdivision_venues_sorted.iloc[ind, 1:] = return_most_common_venues(busan_grouped.iloc[ind, :], num_top_venues)
subdivision_venues_sorted.head()
# Cluster subdivisions on their standardized venue-category counts.
# Fix: drop('Subdivision', 1) used the positional `axis` argument, which was
# removed in pandas 2.0 — use the explicit columns= keyword.
busan_grouped_clustering = busan_grouped.drop(columns='Subdivision')
Clus_dataSet = StandardScaler().fit_transform(busan_grouped_clustering)

# Choose k via the silhouette score (higher is better).
silhouette_scores = []
for k in range(2, 6):
    # Fix: the original called .fit() and then .fit_predict(), fitting the
    # model twice; fit_predict alone suffices. random_state makes the
    # scan reproducible run to run.
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(Clus_dataSet)
    silhouette_scores.append(silhouette_score(Clus_dataSet, labels, metric='euclidean'))

plt.plot(range(2, 6), silhouette_scores)
plt.title('Silhouette Method')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.show()
# Optimal K is 4, based on the silhouette scan above.
kmeans = KMeans(n_clusters=4, random_state=0).fit(Clus_dataSet)
kmeans.labels_

# Attach each subdivision's cluster label and merge back onto the geo table.
subdivision_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
busan_merged = pd.merge(busan_data, subdivision_venues_sorted.set_index('Subdivision'), on='Subdivision')
busan_merged

# Inspect clusters: subdivision name plus its most-common-venue columns.
inspect_cols = busan_merged.columns[[0] + list(range(6, busan_merged.shape[1]))]
busan_merged.loc[busan_merged['Cluster Labels'] == 0, inspect_cols]
busan_merged.loc[busan_merged['Cluster Labels'] == 1, inspect_cols]
busan_merged.loc[busan_merged['Cluster Labels'] == 2, inspect_cols]
# Join venue rows onto the cluster table so each marker knows its cluster.
busan_map_data = pd.merge(busan_merged, busan_venues, on='Subdivision')

# Cluster-colored choropleth with a hover tooltip.
busan_map = build_map(busan_merged, 'mixed')
tooltip = build_tooltip(busan_map, busan_merged, ['Subdivision', 'Population', 'Cluster Labels'], ['Subdivision: ','Population: ', 'Cluster: '])

# Zoom-aware layer that groups nearby venue markers together.
marker_cluster = MarkerCluster(
    name='Venues cluster',
    overlay=True,
    control=False,
    icon_create_function=None,
)

latitudes = busan_map_data['Venue Latitude']
longitudes = busan_map_data['Venue Longitude']
subdivisions = busan_map_data['Subdivision']
categories = busan_map_data['Venue Category']
cluster_labels = busan_map_data['Cluster Labels']
for lat, lon, sub, category, cluster in zip(latitudes, longitudes, subdivisions, categories, cluster_labels):
    popup = folium.Popup('<strong>{}</strong><br><strong>Category</strong>: {}<br><strong>Cluster</strong>: {}'.format(sub, category, cluster), max_width=2650)
    marker_cluster.add_child(
        folium.Marker(
            location=[lat, lon],
            popup=popup,
            icon=None,
        )
    )

marker_cluster.add_to(busan_map)
busan_map